# Preprocessing pipeline (预处理)

import pandas as pd
import numpy as np
import jieba
import re
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
from tensorflow import keras
from nlu_model.util.pkl_impl import save_pkl, load_pkl

# data preprocess
def loadfile(random_state=None):
    """Load positive/negative shopping reviews, tokenize them, and split.

    Args:
        random_state: optional seed forwarded to train_test_split so the
            split can be made reproducible; default None preserves the
            original (random) behavior.

    Returns:
        x_train, x_test, y_train, y_test — the x_* are arrays of token
        lists, the y_* are 1.0/0.0 labels (1 = positive, 0 = negative).
    """
    # Fixed: `index=None` is not a valid read_excel keyword and raises
    # TypeError on modern pandas; `index_col=None` is what was intended.
    neg = pd.read_excel('./data/cls/shopping_reviews/neg.xls', header=None, index_col=None)
    pos = pd.read_excel('./data/cls/shopping_reviews/pos.xls', header=None, index_col=None)

    def cw(x):
        # Strip punctuation (ASCII and full-width Chinese) before jieba
        # segmentation so punctuation never enters the vocabulary.
        punctuation = r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）：]+"
        x = re.sub(punctuation, "", x)
        return list(jieba.cut(x))

    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)

    # Labels are aligned with the concatenation order below: positives
    # first (ones), then negatives (zeros).
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y,
        test_size=0.2, random_state=random_state)

    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = loadfile()

# Persist the split to disk as "label<TAB>space-joined tokens", one
# example per line, so later runs can skip the excel/tokenize step.
with open("./data/cls/shopping_reviews/train.txt", "w") as f:
    f.writelines("%s\t%s\n" % (label, " ".join(tokens))
                 for label, tokens in zip(y_train, x_train))

with open("./data/cls/shopping_reviews/test.txt", "w") as f:
    f.writelines("%s\t%s\n" % (label, " ".join(tokens))
                 for label, tokens in zip(y_test, x_test))

# pretrain_w2v
def pretrain_w2v(x_train):
    """Train Word2Vec on tokenized sentences and build an embedding matrix
    plus the matching word->index dictionary.

    Index layout: 0 = <PAD>, 1..V = vocabulary words, V+1 = <UNK>, where
    the <UNK> row is the mean of all trained word vectors.

    Args:
        x_train: iterable of token lists (one list per sentence).

    Returns:
        (embedding_weights, imdb_w2v, word2idx_dic) — embedding_weights is
        a (V+2, N_DIM) numpy array row-aligned with word2idx_dic.
    """
    N_DIM = 300        # word-vector dimensionality
    MIN_COUNT = 5      # drop words seen fewer than MIN_COUNT times
    w2v_EPOCH = 15     # word2vec training epochs
    MAXLEN = 50        # max sentence length (saved for downstream use)

    # Initialize model and build vocab (gensim 3.x API: `size=`).
    imdb_w2v = Word2Vec(size=N_DIM, min_count=MIN_COUNT)
    imdb_w2v.build_vocab(x_train)

    # Train over the training reviews (this may take several minutes).
    imdb_w2v.train(x_train, total_examples=len(x_train), epochs=w2v_EPOCH)
    print("model train done")

    # Embedding matrix: row 0 stays all-zero for <PAD>; the vocabulary
    # fills rows 1..V; the last row is reserved for <UNK>.
    # (Fixed: removed a dead `np.zeros((n_symbols, 300))` whose result was
    # discarded, and the hard-coded 300 that duplicated N_DIM.)
    vocab = list(imdb_w2v.wv.vocab.keys())
    n_symbols = len(vocab) + 2
    embedding_weights = np.zeros((n_symbols, N_DIM))
    word2idx_dic = {"<PAD>": 0}
    for idx, w in enumerate(vocab, start=1):
        embedding_weights[idx] = imdb_w2v.wv[w]
        word2idx_dic[w] = idx

    # <UNK> row = mean of all trained vectors (vectorized instead of the
    # original O(V*N_DIM) pure-Python accumulation).
    unk_idx = n_symbols - 1
    embedding_weights[unk_idx] = embedding_weights[1:unk_idx].mean(axis=0)
    word2idx_dic["<UNK>"] = unk_idx

    # Save the word->index dictionary (pickle + human-readable text).
    save_pkl('./data/ptm/shopping_reviews/w2v_word2idx2020100601.pkl', word2idx_dic)
    with open('./data/ptm/shopping_reviews/w2v_word2idx2020100601.txt', 'w') as f:
        for word, word_id in word2idx_dic.items():
            f.write("%s\t%s\n" % (word, word_id))

    # Save the embedding matrix (pickle + text).
    # Fixed: the original wrote a literal "/n" instead of a newline, so the
    # whole matrix ended up on one line.
    save_pkl("./data/ptm/shopping_reviews/w2v_model_metric_2020100601.pkl", embedding_weights)
    with open("./data/ptm/shopping_reviews/w2v_model_metric_2020100601.txt", "w") as f:
        for row in embedding_weights:
            f.write("%s\n" % (",".join(str(v) for v in row)))

    # Persist the hyper-parameters used, so inference can reuse them.
    save_pkl("./data/ptm/shopping_reviews/w2v_model_conf_2020100601.pkl", [N_DIM, MIN_COUNT, w2v_EPOCH, MAXLEN])

    return embedding_weights, imdb_w2v, word2idx_dic

# Pretrain word2vec on the training split; exposes the embedding matrix,
# the gensim model, and the word->index dictionary at module level.
embedding_weights,imdb_w2v, word2idx_dic = pretrain_w2v(x_train)

def word2idx(source_data, word2idx_dic):
    """Map each tokenized sentence to a list of vocabulary indices.

    Tokens absent from `word2idx_dic` are mapped to the last index,
    i.e. len(word2idx_dic) - 1, which is the <UNK> slot by construction.
    """
    unk_id = len(word2idx_dic) - 1
    return [[word2idx_dic.get(token, unk_id) for token in sentence]
            for sentence in source_data]

print(x_train[0])
# Replace token lists with id lists using the word2vec dictionary.
x_train = word2idx(x_train, word2idx_dic)
x_test = word2idx(x_test, word2idx_dic)
print(x_train[0])

# data_preprocess: right-pad (or truncate) every id sequence to length 50;
# 0 is the <PAD> id by construction of word2idx_dic.
pad = keras.preprocessing.sequence.pad_sequences
x_train = pad(x_train, value=0, padding='post', maxlen=50)
print(x_train[0])
x_test = pad(x_test, value=0, padding='post', maxlen=50)

# model
print(type(embedding_weights))
print(type(embedding_weights[0]))
inputs = keras.layers.Input(shape=(50,))
embedding_layer = keras.layers.Embedding(output_dim = 300, # 词向量 长度（100）
                            input_dim = len(embedding_weights), # 字典长度
                            weights=[embedding_weights], # 重点：预训练的词向量系数
                            input_length=50, # 每句话的 最大长度（必须padding） 
                            trainable=True # 是否在 训练的过程中 更新词向量
                            )
x = embedding_layer(inputs)

l_conv1 = keras.layers.Conv1D(filters=300, kernel_size=3, activation='relu')(x)  #现长度 = 1+（原长度-卷积核大小+2*填充层大小） /步长 卷积核的形状（fsz，embedding_size）
l_pool1 = keras.layers.MaxPool1D(pool_size=3)(l_conv1)  # 这里面最大的不同 池化层核的大小与卷积完的数据长度一样
l_pool11 = keras.layers.Flatten()(l_pool1)    #一般为卷积网络最近全连接的前一层，用于将数据压缩成一维

out = keras.layers.Dropout(0.5)(l_pool11)
output = keras.layers.Dense(32, activation='relu')(out)
 
pred = keras.layers.Dense(units=1, activation='sigmoid')(output)
 
model = keras.models.Model(inputs=inputs, outputs=pred)
# adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.summary()
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

history = model.fit(x_train, y_train, batch_size=64,
                    epochs=3,
                    validation_data=(x_test, y_test),
                    verbose=1)
# evalute model
results = model.evaluate(x_test, y_test)
print(results)

# Quick smoke test: score one hand-written review with the trained model.
# Fixed: the original indexed into a fixed np.zeros(50) buffer, which
# raised IndexError for sentences longer than 50 tokens, and its padding
# loop called .append() on an ndarray (ndarrays have no append method).
sentence = list(jieba.cut("这台手机真性能还挺好的"))
MAXLEN = 50
unk_id = len(word2idx_dic) - 1  # <UNK> occupies the last index by construction
# Truncate to MAXLEN, map tokens to ids, then right-pad with <PAD> = 0.
sentence_id = [word2idx_dic.get(token, unk_id) for token in sentence[:MAXLEN]]
sentence_id += [0] * (MAXLEN - len(sentence_id))
sentence_id = np.array(sentence_id, dtype=float)
print(len(sentence_id))
print(sentence_id)
print(model.predict(np.array([sentence_id])))

